In [1]:
from pyspark.ml import Pipeline
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD, LogisticRegressionModel
from pyspark.ml.feature import IDF, HashingTF, Tokenizer, Tokenizer, StopWordsRemover, RegexTokenizer, CountVectorizer, StringIndexer
from pyspark.sql import Row
from pyspark.sql.functions import UserDefinedFunction, col, isnull, isnan, when, count
from pyspark.sql.types import *
from pyspark.ml.clustering import LDA
Starting Spark application
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
2application_1531680290955_0008pyspark3idleLinkLink✔
SparkSession available as 'spark'.

Loading data from Blob Storage into Spark

In [2]:
# Read the tab-separated review export from Blob Storage. Malformed rows are
# dropped (DROPMALFORMED) and column types are inferred from the data.
yelp_review = (
    spark.read
    .option("sep", "\t")
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", True)
    .csv('wasb://main@yelpproject.blob.core.windows.net/yelp_review_tab.csv')
)

# Persist as a table named "yelp_review" (replacing any previous copy) so that
# later SQL queries can reference it by name.
yelp_review.write.mode("overwrite").saveAsTable("yelp_review")
yelp_review.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
In [3]:
# Read the comma-separated business export from Blob Storage. Malformed rows
# are dropped (DROPMALFORMED) and column types are inferred from the data.
yelp_business = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", True)
    .csv('wasb://main@yelpproject.blob.core.windows.net/yelp_business.csv')
)

# Persist as a table named "yelp_business" (replacing any previous copy) so
# that later SQL queries can reference it by name.
yelp_business.write.mode("overwrite").saveAsTable("yelp_business")
yelp_business.printSchema()
root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: double (nullable = true)
 |-- categories: string (nullable = true)
In [4]:
%%sql
/* List the tables and temporary views visible to this Spark session */
SHOW TABLES
In [1]:
from pyspark.ml import Pipeline
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD, LogisticRegressionModel
from pyspark.ml.feature import IDF, HashingTF, Tokenizer, Tokenizer, StopWordsRemover, RegexTokenizer, CountVectorizer, StringIndexer
from pyspark.sql import Row
from pyspark.sql.functions import UserDefinedFunction, col, isnull, isnan, when, count
from pyspark.sql.types import *
from pyspark.ml.clustering import LDA
Starting Spark application
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
2application_1531680290955_0008pyspark3idleLinkLink✔
SparkSession available as 'spark'.

Loading data from Blob Storage into Spark

In [2]:
# Read the tab-separated review export from Blob Storage. Malformed rows are
# dropped (DROPMALFORMED) and column types are inferred from the data.
yelp_review = (
    spark.read
    .option("sep", "\t")
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", True)
    .csv('wasb://main@yelpproject.blob.core.windows.net/yelp_review_tab.csv')
)

# Persist as a table named "yelp_review" (replacing any previous copy) so that
# later SQL queries can reference it by name.
yelp_review.write.mode("overwrite").saveAsTable("yelp_review")
yelp_review.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
In [3]:
# Read the comma-separated business export from Blob Storage. Malformed rows
# are dropped (DROPMALFORMED) and column types are inferred from the data.
yelp_business = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", True)
    .csv('wasb://main@yelpproject.blob.core.windows.net/yelp_business.csv')
)

# Persist as a table named "yelp_business" (replacing any previous copy) so
# that later SQL queries can reference it by name.
yelp_business.write.mode("overwrite").saveAsTable("yelp_business")
yelp_business.printSchema()
root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: double (nullable = true)
 |-- categories: string (nullable = true)
In [4]:
%%sql
/* List the tables and temporary views visible to this Spark session */
SHOW TABLES
database tableName isTemporary
0 default cities_df False
1 default hivesampletable False
2 default yelp_business False
3 default yelp_review False

Data exploration

In [5]:
# How many businesses are in the dataset? (row count of yelp_business)
yelp_business.count()
127210
In [6]:
# Are there any null values in yelp_business? Count the nulls per column.
# `when(isnull(name), name)` yields the column *name* as a string literal for
# null cells (and null otherwise), so `count` ends up counting the null cells.
null_count_exprs = [
    count(when(isnull(column_name), column_name)).alias(column_name)
    for column_name in yelp_business.columns
]
yelp_business.select(null_count_exprs).show()
+-----------+----+------------+-------+----+-----+-----------+--------+---------+-----+------------+-------+----------+
|business_id|name|neighborhood|address|city|state|postal_code|latitude|longitude|stars|review_count|is_open|categories|
+-----------+----+------------+-------+----+-----+-----------+--------+---------+-----+------------+-------+----------+
|          0|   0|       75939|      0|   1|    0|        590|       1|        1|    0|           0|      0|         0|
+-----------+----+------------+-------+----+-----+-----------+--------+---------+-----+------------+-------+----------+

Subsetting data

In [5]:
# Subset all restaurants: keep the businesses whose `categories` string matches
# any of Food, Restaurants, Bars or Bakeries.
yelp_restaurants = yelp_business.filter(yelp_business.categories.rlike('Food|Restaurants|Bars|Bakeries'))

# Expose the subset to SQL as a temporary view. createOrReplaceTempView is the
# replacement for registerTempTable, which has been deprecated since Spark 2.0.
yelp_restaurants.createOrReplaceTempView('yelp_restaurants')
yelp_restaurants.count()
58588
In [8]:
%%sql
/* Preview 10 rows of the yelp_restaurants view */
SELECT *
FROM yelp_restaurants
LIMIT 10
business_id name neighborhood address city state postal_code latitude longitude stars review_count is_open categories
0 PfOCPjBrlQAnz__NXj9h_w """Brick House Tavern + Tap""" NaN """581 Howe Ave""" Cuyahoga Falls OH 44221 41.119535 -81.475690 3.5 116 1.0 American (New);Nightlife;Bars;Sandwiches;Ameri...
1 o9eMRCWt5PkpLDE0gOPtcQ """Messina""" NaN """Richterstr. 11""" Stuttgart BW 70567 48.727200 9.147950 4.0 5 1.0 Italian;Restaurants
2 EsMcGiZaQuG1OOvL9iUFug """Any Given Sundae""" NaN """2612 Brandt School Rd""" Wexford PA 15090 40.615102 -80.091349 5.0 15 1.0 Coffee & Tea;Ice Cream & Frozen Yogurt;Food
3 XOSRcvtaKc_Q5H1SAzN20A """East Coast Coffee""" NaN """737 West Pike St""" Houston PA 15342 40.241548 -80.212815 4.5 3 0.0 Breakfast & Brunch;Gluten-Free;Coffee & Tea;Fo...
4 xcgFnd-MwkZeO5G2HQ0gAQ """T & T Bakery and Cafe""" Markham Village """35 Main Street N""" Markham ON L3P 1X3 43.875177 -79.260153 4.0 38 1.0 Bakeries;Bagels;Food
5 fNMVV_ZX7CJSDWQGdOM8Nw """Showmars Government Center""" Uptown """600 E 4th St""" Charlotte NC 28202 35.221647 -80.839345 3.5 7 1.0 Restaurants;American (Traditional)
6 l09JfMeQ6ynYs5MCJtrcmQ """Alize Catering""" Yonge and Eglinton """2459 Yonge St""" Toronto ON M4P 2H6 43.711399 -79.399339 3.0 12 0.0 Italian;French;Restaurants
7 lHYiCS-y8AFjUitv6MGpxg """Starbucks""" Liberty Village """85 Hanna Avenue""" Toronto ON M6K 3S3 43.639863 -79.419533 4.0 21 1.0 Food;Coffee & Tea
8 VSGcuYDV3q-AAZ9ZPq4fBQ """Sportster's""" The Danforth """1430 Danforth Avenue""" Toronto ON M4J 1N4 43.682867 -79.326964 2.5 7 1.0 Bars;Sports Bars;Nightlife
9 1K4qrnfyzKzGgJPBEcJaNQ """Chula Taberna Mexicana""" Leslieville """1058 Gerrard Street E""" Toronto ON M4M 3A6 43.669256 -79.335902 3.5 39 1.0 Tiki Bars;Nightlife;Mexican;Restaurants;Bars
In [10]:
%%sql
/* What are the top 3 cities with the highest number of restaurants? */
SELECT city, COUNT(*) AS N
FROM yelp_restaurants
GROUP BY city
ORDER BY N DESC
LIMIT 3
city N
0 Toronto 8525
1 Las Vegas 5425
2 Montréal 3957

Subset the data for TOP 3 cities

In [6]:
# Reviews (text, id, date) of every restaurant located in Toronto.
subset_toronto = spark.sql("select text, review_id, date from yelp_review where business_id IN (select business_id from yelp_restaurants where city IN ('Toronto'))")

# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
subset_toronto.createOrReplaceTempView('subset_toronto')

# Number of reviews for Toronto
subset_toronto.count()
136201
In [7]:
# Reviews (text, id, date) of every restaurant located in Las Vegas.
subset_las_vegas = spark.sql("select text, review_id, date from yelp_review where business_id IN (select business_id from yelp_restaurants where city IN ('Las Vegas'))")

# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
subset_las_vegas.createOrReplaceTempView('subset_las_vegas')

# Number of reviews for Las Vegas
subset_las_vegas.count()
384761
In [8]:
# Reviews (text, id, date) of every restaurant located in Montréal.
subset_montreal = spark.sql("select text, review_id, date from yelp_review where business_id IN (select business_id from yelp_restaurants where city IN ('Montréal'))")

# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0).
subset_montreal.createOrReplaceTempView('subset_montreal')

# Number of reviews for Montreal
subset_montreal.count()
53412
In [14]:
%%sql
/* List the tables and temporary views, including the per-city subsets */
SHOW TABLES
database tableName isTemporary
0 default hivesampletable False
1 default yelp_business False
2 default yelp_review False
3 subset_las_vegas True
4 subset_montreal True
5 subset_toronto True
6 yelp_restaurants True
In [15]:
# Peek at a single Toronto review row.
subset_toronto.take(1)
[Row(text='"Changed my rating from 2 stars to 1 star, due to the attitude of either the staff or friends of Joe\'s Buffet Palace. I don\'t know why 8 of them think my review is ""funny"" (are they trying to troll me?) but I was not amused by the bad food experience I had here. I hope they continue to make fake 5-star reviews of this place because that is way funnier to me. I will never step foot in Joe\'s Buffet Palace again and will tell everyone I know to avoid it."', review_id='Xwz2i64CI0SE5wOgG9QB-w', date=datetime.datetime(2012, 4, 13, 0, 0))]
In [ ]:
import pandas as pd

# City labels and their review subsets, in matching order.
cities = ['Toronto', 'Las Vegas', 'Montréal']
datasets = [subset_toronto, subset_las_vegas, subset_montreal]

# One record per (city, topic, term). Records are accumulated in a plain list
# and turned into a DataFrame once at the end; growing a DataFrame row by row
# copies it on every append and DataFrame.append no longer exists in pandas 2.x.
topic_records = []

for city, dataset in zip(cities, datasets):
    # Split the review text on whitespace and on the characters , . " ; ( )
    # (raw string: same pattern as before, without the invalid "\s" escape).
    tokenizer = RegexTokenizer(inputCol='text', outputCol='tokenized', pattern=r'\s+|[,.";()]')
    featurizedData0 = tokenizer.transform(dataset)

    # Remove stop words using StopWordsRemover's default word list.
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='words')
    featurizedData1 = stopwords.transform(featurizedData0)

    # Term-frequency vectors (CountVectorizer) over a vocabulary capped at 2000 terms.
    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=2000)
    cvmodel = cv.fit(featurizedData1)
    featurizedData = cvmodel.transform(featurizedData1)

    # Index -> token lookup used to translate LDA term indices back into words.
    vocab = cvmodel.vocabulary

    # TF-IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # Generate 10 topics for each city
    lda = LDA(k=10, seed=1234, optimizer='online', featuresCol='features')
    ldamodel = lda.fit(rescaledData)

    # Top 5 terms per topic. The result has one small row per topic, so it is
    # collected to the driver and the index -> token mapping is done locally.
    topicIndices = ldamodel.describeTopics(maxTermsPerTopic=5)
    for topic_number, topic_row in enumerate(topicIndices.collect(), start=1):
        term_weight_pairs = zip(topic_row['termIndices'], topic_row['termWeights'])
        for term_index, weight in term_weight_pairs:
            topic_records.append({
                'City': city,
                'Term': vocab[term_index],
                'Topic': topic_number,  # 1-based position of the topic
                'Weight': weight,
            })

# Add additional information: City and Number of Topic (one row per term).
cities_df = pd.DataFrame(topic_records, columns=['City', 'Term', 'Topic', 'Weight'])
In [10]:
# Raise the pandas row limit so the whole topic table is printed untruncated.
pd.set_option("display.max_rows", 300)
print(cities_df)
          City        Term  Topic    Weight
0      Toronto       pizza      1  0.016716
1      Toronto       ramen      1  0.010416
2      Toronto        eggs      1  0.008036
3      Toronto       bacon      1  0.006985
4      Toronto       fries      1  0.006122
5      Toronto      coffee      2  0.012492
6      Toronto         tea      2  0.011473
7      Toronto       store      2  0.006716
8      Toronto         get      2  0.005462
9      Toronto       place      2  0.005394
10     Toronto        food      3  0.006700
11     Toronto        good      3  0.006615
12     Toronto       place      3  0.006451
13     Toronto       great      3  0.006296
14     Toronto   breakfast      3  0.006066
15     Toronto       great      4  0.010428
16     Toronto     service      4  0.006970
17     Toronto        food      4  0.006837
18     Toronto  atmosphere      4  0.006018
19     Toronto        wine      4  0.005877
20     Toronto     minutes      5  0.008381
21     Toronto          us      5  0.008012
22     Toronto       order      5  0.006432
23     Toronto        food      5  0.006317
24     Toronto        time      5  0.006211
25     Toronto          us      6  0.007896
26     Toronto        food      6  0.006179
27     Toronto     service      6  0.006043
28     Toronto    customer      6  0.005761
29     Toronto       asked      6  0.005628
30     Toronto     chicken      7  0.013164
31     Toronto        rice      7  0.008658
32     Toronto        soup      7  0.008040
33     Toronto       spicy      7  0.007311
34     Toronto       fried      7  0.006963
35     Toronto       cream      8  0.017626
36     Toronto         ice      8  0.016885
37     Toronto   chocolate      8  0.010074
38     Toronto        cake      8  0.009935
39     Toronto       sweet      8  0.006235
40     Toronto         pho      9  0.011126
41     Toronto      burger      9  0.008292
42     Toronto     chicken      9  0.007624
43     Toronto        beef      9  0.006002
44     Toronto        pork      9  0.005612
45     Toronto       place     10  0.007603
46     Toronto    location     10  0.006967
47     Toronto         bar     10  0.006590
48     Toronto       great     10  0.005867
49     Toronto        good     10  0.005848
50   Las Vegas       sushi      1  0.011330
51   Las Vegas           &      1  0.007256
52   Las Vegas       steak      1  0.005983
53   Las Vegas       salad      1  0.004997
54   Las Vegas        good      1  0.004936
55   Las Vegas     burgers      2  0.008214
56   Las Vegas         dim      2  0.007210
57   Las Vegas         sum      2  0.006725
58   Las Vegas      burger      2  0.006477
59   Las Vegas        good      2  0.006069
60   Las Vegas       pizza      3  0.020863
61   Las Vegas      always      3  0.010976
62   Las Vegas       great      3  0.009376
63   Las Vegas    friendly      3  0.008517
64   Las Vegas        love      3  0.008352
65   Las Vegas       great      4  0.009927
66   Las Vegas       vegas      4  0.008146
67   Las Vegas        food      4  0.007817
68   Las Vegas     amazing      4  0.007642
69   Las Vegas     service      4  0.007486
70   Las Vegas   breakfast      5  0.008176
71   Las Vegas      buffet      5  0.006748
72   Las Vegas        eggs      5  0.006710
73   Las Vegas   chocolate      5  0.005840
74   Las Vegas        good      5  0.005676
75   Las Vegas     lobster      6  0.010598
76   Las Vegas        soup      6  0.010356
77   Las Vegas     chicken      6  0.008993
78   Las Vegas        rice      6  0.008741
79   Las Vegas     ordered      6  0.007910
80   Las Vegas        line      7  0.007525
81   Las Vegas       drive      7  0.006749
82   Las Vegas        wait      7  0.005878
83   Las Vegas         get      7  0.005754
84   Las Vegas        taco      7  0.005159
85   Las Vegas      burger      8  0.014047
86   Las Vegas       fries      8  0.010258
87   Las Vegas      cheese      8  0.009573
88   Las Vegas     chicken      8  0.008377
89   Las Vegas       sauce      8  0.005758
90   Las Vegas          us      9  0.011784
91   Las Vegas     minutes      9  0.007803
92   Las Vegas       order      9  0.007407
93   Las Vegas       asked      9  0.007050
94   Las Vegas        said      9  0.006904
95   Las Vegas        room     10  0.007113
96   Las Vegas       hotel     10  0.006595
97   Las Vegas         bar     10  0.006340
98   Las Vegas       strip     10  0.006204
99   Las Vegas       place     10  0.006148
100   Montréal        gras      1  0.015071
101   Montréal        foie      1  0.014215
102   Montréal   chocolate      1  0.012642
103   Montréal        rich      1  0.006829
104   Montréal     lobster      1  0.006413
105   Montréal          de      2  0.031393
106   Montréal          et      2  0.027222
107   Montréal          le      2  0.024552
108   Montréal          la      2  0.021022
109   Montréal          un      2  0.019353
110   Montréal      bagels      3  0.019748
111   Montréal       bagel      3  0.017836
112   Montréal         pho      3  0.017467
113   Montréal        meat      3  0.014684
114   Montréal      smoked      3  0.014048
115   Montréal          us      4  0.011371
116   Montréal     minutes      4  0.007184
117   Montréal        food      4  0.006491
118   Montréal       order      4  0.005953
119   Montréal       asked      4  0.005918
120   Montréal        good      5  0.006617
121   Montréal        meat      5  0.005530
122   Montréal        beef      5  0.005314
123   Montréal       place      5  0.005286
124   Montréal    montreal      5  0.005180
125   Montréal      coffee      6  0.009671
126   Montréal       great      6  0.008989
127   Montréal       place      6  0.007131
128   Montréal        good      6  0.006841
129   Montréal        best      6  0.006645
130   Montréal       great      7  0.008221
131   Montréal   breakfast      7  0.007989
132   Montréal      brunch      7  0.007599
133   Montréal        food      7  0.006790
134   Montréal    friendly      7  0.006211
135   Montréal         tea      8  0.007554
136   Montréal         bun      8  0.007414
137   Montréal       latte      8  0.007186
138   Montréal         bar      8  0.007078
139   Montréal        good      8  0.006722
140   Montréal      burger      9  0.008815
141   Montréal       sushi      9  0.007186
142   Montréal        food      9  0.006334
143   Montréal     chicken      9  0.006261
144   Montréal       place      9  0.005796
145   Montréal     chicken     10  0.008390
146   Montréal        good     10  0.006735
147   Montréal     poutine     10  0.006380
148   Montréal        like     10  0.005741
149   Montréal       fries     10  0.005347
In [11]:
# Create a new, empty Spark DataFrame that the topic rows are appended to.
# The declared types match the values that are actually stored:
#   - Topic receives Python ints, which Spark infers as long;
#   - Weight holds fractional LDA term weights, so it must be a double
#     (it was declared as an integer and only worked because union() widened it).
schema = StructType([
    StructField('City', StringType(), True),
    StructField('Term', StringType(), True),
    StructField('Topic', LongType(), True),
    StructField('Weight', DoubleType(), True),
])
cities_DF = spark.createDataFrame(sc.emptyRDD(), schema)
In [12]:
# Convert Pandas DataFrame into Spark DataFrame (in pandas version > 0.19 it is done in one line).
# All rows are built first so that exactly one Spark DataFrame and one union are
# created, instead of one tiny DataFrame plus one union per pandas row.
new_rows = [
    # int()/float() turn numpy scalars into plain Python values for Spark's type inference.
    Row(City=city, Topic=int(topic), Term=term, Weight=float(weight))
    for city, term, topic, weight in zip(
        cities_df['City'], cities_df['Term'], cities_df['Topic'], cities_df['Weight'])
]

if new_rows:  # createDataFrame cannot infer a schema from an empty list
    # union() matches columns by position, not by name, so reorder the new rows
    # to cities_DF's column order before appending them.
    new_rows_DF = spark.createDataFrame(new_rows).select(*cities_DF.columns)
    cities_DF = cities_DF.union(new_rows_DF)
In [13]:
# Peek at a single row of the converted Spark DataFrame.
cities_DF.take(1)
[Row(City='Toronto', Term='pizza', Topic=1, Weight=0.016716298745811146)]
In [14]:
# Persist the per-city topic terms as the table "cities_DF" (replacing any
# previous copy), then report how many rows the DataFrame holds.
cities_DF.write.mode("overwrite").saveAsTable("cities_DF")
cities_DF.count()
150
In [15]:
%%sql
/* List the tables and temporary views; cities_df should now be present */
SHOW TABLES
database tableName isTemporary
0 default cities_df False
1 default hivesampletable False
2 default yelp_business False
3 default yelp_review False
4 subset_las_vegas True
5 subset_montreal True
6 subset_toronto True
7 yelp_restaurants True

LDA clustering for business categories

In [16]:
# Subset the data from business table 
BusinessData = yelp_business.select(yelp_business.categories, yelp_business.business_id, yelp_business.name)
BusinessData.registerTempTable('BusinessData')
BusinessData.head(1)
[Row(categories="Hair Stylists;Hair Salons;Men's Hair Salons;Blow Dry/Out Services;Hair Extensions;Beauty & Spas", business_id='He-G7vWjzVUysIKrfNbPUQ', name='"""Stephen Szabo Salon"""')]
In [17]:
# Tokenizing text: split the category string on whitespace and on the
# characters , . " ; ( ). The pattern is a raw string so that "\s" reaches the
# regex engine as written instead of being an invalid Python escape sequence.
tokenizer = RegexTokenizer(inputCol='categories', outputCol='tokenized', pattern=r'\s+|[,.";()]')
featurizedData0 = tokenizer.transform(BusinessData)

# Remove stop words using StopWordsRemover's default word list.
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='words')
featurizedData1 = stopwords.transform(featurizedData0)

# Term Frequency Vectorization over a vocabulary capped at 1000 terms:
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=1000)
cvmodel = cv.fit(featurizedData1)
featurizedData = cvmodel.transform(featurizedData1)

# Index -> token lookup, used later to translate LDA term indices into words.
vocab = cvmodel.vocabulary
# NOTE(review): the broadcast copy is not referenced by any visible cell; kept
# in case something outside this view relies on it.
vocab_broadcast = sc.broadcast(vocab)

# TF-IDF: rescale the raw term counts by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
In [19]:
# Fit an LDA model with 25 topics on the TF-IDF "features" column, using the
# online optimizer and a fixed seed.
lda = LDA(
    featuresCol="features",
    k=25,
    optimizer='online',
    seed=1234,
)
ldamodel = lda.fit(rescaledData)
In [20]:
# Score the fitted model against the corpus it was trained on.
ll = ldamodel.logLikelihood(rescaledData)
lp = ldamodel.logPerplexity(rescaledData)

# Print each metric behind its descriptive label.
for label, value in (
    ("The lower bound on the log likelihood of the entire corpus: ", ll),
    ("The upper bound on perplexity: ", lp),
):
    print(label + str(value))
The lower bound on the log likelihood of the entire corpus: -11556902.025592694
The upper bound on perplexity: 4.185029986601038
In [21]:
# Ask the fitted model whether it is a distributed LDA model (recorded output: False).
ldamodel.isDistributed()
False
In [22]:
# Describe every topic by its highest-weighted terms (10 per topic, the
# describeTopics default) and display all 25 topic rows.
ldatopics = ldamodel.describeTopics(maxTermsPerTopic=10)
ldatopics.show(n=25)
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[51, 6, 58, 72, 9...|[0.11221426670643...|
|    1|[20, 5, 37, 35, 4...|[0.07692465479914...|
|    2|[16, 129, 48, 99,...|[0.06926009550134...|
|    3|[28, 53, 119, 153...|[0.22136805439616...|
|    4|[142, 148, 163, 1...|[0.06473304219797...|
|    5|[43, 22, 105, 3, ...|[0.06831633650490...|
|    6|[31, 34, 3, 114, ...|[0.13964217564008...|
|    7|[113, 14, 140, 10...|[0.05955085365637...|
|    8|[32, 41, 74, 13, ...|[0.08912716556999...|
|    9|[17, 38, 39, 26, ...|[0.13455815777818...|
|   10|[50, 73, 3, 71, 2...|[0.15113119770760...|
|   11|[24, 25, 109, 201...|[0.13847020934105...|
|   12|[3, 29, 40, 1, 33...|[0.11725222116549...|
|   13|[68, 71, 15, 20, ...|[0.05596693038739...|
|   14|[5, 2, 83, 133, 2...|[0.08818045054138...|
|   15|[76, 52, 61, 102,...|[0.09700588055615...|
|   16|[7, 8, 21, 23, 0,...|[0.12025573319319...|
|   17|[10, 12, 2, 0, 93...|[0.14568510287689...|
|   18|[47, 66, 75, 1, 1...|[0.11967617178241...|
|   19|[56, 77, 146, 3, ...|[0.14899612570985...|
|   20|[19, 22, 135, 91,...|[0.12408254685498...|
|   21|[42, 46, 1, 139, ...|[0.12653463304892...|
|   22|[18, 11, 16, 65, ...|[0.11524058411067...|
|   23|[6, 9, 80, 86, 85...|[0.15309688013229...|
|   24|[49, 30, 81, 55, ...|[0.11113403052155...|
+-----+--------------------+--------------------+
In [23]:
# Vocabulary of the fitted CountVectorizer (term index -> token).
vocablist = cvmodel.vocabulary
# Keep only the 5 highest-weighted terms of each topic.
topicIndices = ldamodel.describeTopics(5)
In [24]:
# Translate each topic's term indices into vocabulary tokens.
# (The stray `type(topics_rdd)` expression was removed: in the middle of a
# cell its value is neither displayed nor used.)
topics_rdd = topicIndices.rdd
term_index_lists = topics_rdd.map(lambda row: row['termIndices'])
topics_words = term_index_lists\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

# Print the top terms of every topic, separated by a dashed rule.
for index, topic in enumerate(topics_words):
    print("topic: ", index)
    print(topic, "------------------", sep="\n")
topic:  0
['chinese', 'bars', 'cafes', 'pubs', 'nightlife']
------------------
topic:  1
['arts', 'home', 'garden', 'entertainment', 'shopping']
------------------
topic:  2
['repair', 'gas', 'stations', 'convenience', 'automotive']
------------------
topic:  3
['pet', 'pets', 'delis', 'groomers', 'sandwiches']
------------------
topic:  4
['pool', 'transportation', 'wash', 'detailing', 'storage']
------------------
topic:  5
['specialty', 'health', 'dentists', 'food', 'medical']
------------------
topic:  6
['tea', 'coffee', 'food', 'photography', '&']
------------------
topic:  7
['department', 'stores', 'air', 'drugstores', 'heating']
------------------
topic:  8
['pizza', 'italian', 'laundry', 'american', 'restaurants']
------------------
topic:  9
['hotels', 'estate', 'real', 'travel', 'services']
------------------
topic:  10
['grocery', 'seafood', 'food', 'schools', 'providers']
------------------
topic:  11
['life', 'active', 'parks', 'ethnic', 'specialty']
------------------
topic:  12
['food', 'fast', 'burgers', 'restaurants', 'sandwiches']
------------------
topic:  13
['education', 'schools', 'local', 'arts', 'entertainment']
------------------
topic:  14
['home', 'services', 'contractors', 'installation', 'bagels']
------------------
topic:  15
['beer', 'cleaning', 'wine', 'spirits', 'home']
------------------
topic:  16
['spas', 'beauty', 'hair', 'salons', '&']
------------------
topic:  17
['event', 'planning', 'services', '&', 'caterers']
------------------
topic:  18
['mexican', 'chicken', 'wings', 'restaurants', 'asian']
------------------
topic:  19
['bakeries', 'desserts', 'barbers', 'food', 'landscaping']
------------------
topic:  20
['medical', 'health', 'jewelry', 'doctors', 'centers']
------------------
topic:  21
['breakfast', 'brunch', 'restaurants', 'diners', 'american']
------------------
topic:  22
['auto', 'automotive', 'repair', 'dealers', 'tires']
------------------
topic:  23
['bars', 'nightlife', 'ice', 'yogurt', 'frozen']
------------------
topic:  24
['clothing', 'fashion', "women's", 'sports', 'shopping']
------------------
In [ ]: